{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\n",
    "import os\n",
    "import cjieba\n",
    "from gensim.corpora import WikiCorpus\n",
    "from gensim.models import word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(action='ignore', category=FutureWarning)\n",
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wikimedia语料库下载地址：https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2\n",
    "fname = './data/zhwiki-latest-pages-articles.xml.bz2'\n",
    "wiki = WikiCorpus(fname, lemmatize=False, dictionary={}, article_min_tokens=10, token_min_len=1, token_max_len=100, lower=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 提取文本耗时25分钟\n",
    "if not os.path.exists('./data/zhwiki.txt'):\n",
    "    texts_num = 0\n",
    "    with open('./data/zhwiki.txt', 'w', encoding='utf-8') as f:\n",
    "        for text in wiki.get_texts():\n",
    "            f.write(' '.join(text) + '\\n')\n",
    "            texts_num += 1\n",
    "            if texts_num % 10000 == 0:\n",
    "                logging.info(\"已处理 %d 篇文章\" % texts_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 繁体转简体耗时2分钟\n",
    "if not os.path.exists('./data/zhwiki_simp.txt'):\n",
    "    '''! opencc -i ./data/zhwiki.txt -o ./data/zhwiki_simp.txt -c zht2zhs.ini'''\n",
    "    ! opencc -i ./data/zhwiki.txt -o ./data/zhwiki_simp.txt -c t2s.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分词耗时11分钟\n",
    "if not os.path.exists('./data/zhwiki_simp_seg.txt'):\n",
    "    texts_num = 0\n",
    "    f_simp_seg = open('./data/zhwiki_simp_seg.txt', 'w', encoding='utf-8')\n",
    "    with open('./data/zhwiki_simp.txt', 'r', encoding='utf-8') as f:\n",
    "        text = f.readline()\n",
    "        while text:\n",
    "            f_simp_seg.write(' '.join([word for word in cjieba.cut(text.strip(), cut_all=False) if word != ' ']) + '\\n')\n",
    "            text = f.readline()\n",
    "            texts_num += 1\n",
    "            if texts_num % 10000 == 0:\n",
    "                logging.info(\"已处理 %d 篇文章\" % texts_num)\n",
    "    logging.info(\"共处理 %d 篇文章\" % texts_num)\n",
    "    f_simp_seg.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 训练word2vec模型耗时50分钟\n",
    "if not os.path.exists('./data/model/'):\n",
    "    os.makedirs('./data/model/')\n",
    "if not os.path.exists('./data/model/word2vec.model'):\n",
    "    sentences = word2vec.LineSentence('./data/zhwiki_simp_seg.txt')\n",
    "    w2v_model = word2vec.Word2Vec(sentences, size=200, workers=4)\n",
    "    w2v_model.save('./data/model/word2vec.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-09-06 20:59:36,339 : INFO : loading Word2Vec object from ./data/model/word2vec.model\n",
      "2018-09-06 20:59:39,088 : INFO : loading wv recursively from ./data/model/word2vec.model.wv.* with mmap=None\n",
      "2018-09-06 20:59:39,089 : INFO : loading vectors from ./data/model/word2vec.model.wv.vectors.npy with mmap=None\n",
      "2018-09-06 20:59:39,408 : INFO : setting ignored attribute vectors_norm to None\n",
      "2018-09-06 20:59:39,409 : INFO : loading vocabulary recursively from ./data/model/word2vec.model.vocabulary.* with mmap=None\n",
      "2018-09-06 20:59:39,410 : INFO : loading trainables recursively from ./data/model/word2vec.model.trainables.* with mmap=None\n",
      "2018-09-06 20:59:39,411 : INFO : loading syn1neg from ./data/model/word2vec.model.trainables.syn1neg.npy with mmap=None\n",
      "2018-09-06 20:59:41,232 : INFO : setting ignored attribute cum_table to None\n",
      "2018-09-06 20:59:41,234 : INFO : loaded ./data/model/word2vec.model\n"
     ]
    }
   ],
   "source": [
    "w2v_model = word2vec.Word2Vec.load(\"./data/model/word2vec.model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-09-06 20:59:43,693 : INFO : precomputing L2-norms of word weight vectors\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('王室', 0.8594831824302673),\n",
       " ('王储', 0.8557029962539673),\n",
       " ('王后', 0.8546224236488342)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# w2v_model.wv.most_similar(positive=['女人', '国王'], negative=['男人'], topn=3)\n",
    "w2v_model.wv.most_similar_cosmul(positive=['女人', '国王'], negative=['男人'], topn=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'谷物'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_model.wv.doesnt_match('早餐 谷物 晚餐 午餐'.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.88024735"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_model.wv.similarity('女人', '男人')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('狗', 0.7673438787460327),\n",
       " ('犬', 0.710284948348999),\n",
       " ('老鼠', 0.675322413444519)]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_model.wv.similar_by_word('猫', topn=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7762486040592194"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_model.wv.distance('媒体', '介质')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6433907"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 先对每个list的词向量求均值，再计算之间的余弦相似度\n",
    "w2v_model.wv.n_similarity(['寿司', '商店'], ['日语', '餐厅'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 6.83923736e-02,  1.02329269e-01,  9.71030537e-03, -1.61030665e-02,\n",
       "        2.08849721e-02,  5.80555238e-02,  1.18015781e-01, -2.75380518e-02,\n",
       "       -8.45042020e-02, -9.69247222e-02, -1.09279506e-01,  9.02274251e-02,\n",
       "       -3.59832570e-02,  3.35633988e-03, -2.02721311e-03,  1.12083957e-01,\n",
       "       -1.46189081e-02, -1.16647176e-01,  7.64922723e-02, -3.64502408e-02,\n",
       "        2.46583018e-02,  5.00709936e-02, -8.83321166e-02,  9.60599184e-02,\n",
       "        3.13817635e-02, -1.01580143e-01, -6.54485747e-02, -2.66069286e-02,\n",
       "       -1.54746529e-02,  4.91613820e-02, -2.65230499e-02, -2.19071452e-02,\n",
       "       -2.41167005e-02, -1.05453776e-02,  8.22560042e-02, -4.12925072e-02,\n",
       "        3.28649469e-02,  4.58846763e-02, -1.49507686e-01, -3.95058990e-02,\n",
       "       -3.14263552e-02,  6.07363358e-02,  1.21501267e-01,  9.03314203e-02,\n",
       "        4.87113036e-02,  2.88334265e-02,  4.36801165e-02, -1.17368706e-01,\n",
       "       -8.89689997e-02, -3.12791653e-02,  3.64994183e-02,  1.75368637e-02,\n",
       "        2.99781114e-02, -3.12015712e-02, -1.14624031e-01, -3.65493372e-02,\n",
       "       -1.56691913e-02, -6.67126179e-02, -7.42331613e-05, -5.11623695e-02,\n",
       "       -5.93180656e-02, -9.39296484e-02, -1.18826060e-02, -1.80143937e-02,\n",
       "       -9.78198834e-03,  2.61735693e-02,  5.02700508e-02, -2.53586620e-02,\n",
       "        1.74968038e-03,  5.28208865e-03,  7.60181807e-03, -9.39847901e-03,\n",
       "        1.01099610e-01, -1.08966529e-01, -3.08593418e-02,  1.32627841e-02,\n",
       "       -5.70743829e-02,  1.01185225e-01,  4.04534340e-02, -1.86255164e-02,\n",
       "        1.87837593e-02, -9.48520377e-02, -6.83775079e-03, -7.96126761e-03,\n",
       "       -8.31983797e-03,  4.19832282e-02,  1.18058421e-01,  2.67528147e-01,\n",
       "        2.04197075e-02, -9.73343253e-02,  1.53471269e-02,  1.57640595e-02,\n",
       "        1.72226101e-01, -4.80357222e-02, -2.12232303e-02,  1.07961543e-01,\n",
       "       -8.60295519e-02, -4.10360247e-02,  6.77962378e-02,  8.70392770e-02,\n",
       "        4.98127528e-02, -1.38113648e-01,  6.87899441e-02, -1.59582235e-02,\n",
       "        2.39346903e-02, -2.09674006e-03,  6.41022995e-02, -1.06809763e-04,\n",
       "       -2.08450889e-04,  6.50506690e-02,  1.30469957e-02,  8.44613649e-03,\n",
       "        3.02947387e-02,  1.58272833e-02,  1.85025468e-01,  7.65480101e-02,\n",
       "       -1.09313160e-01, -3.99225317e-02,  8.86313692e-02, -7.05067515e-02,\n",
       "        2.78081018e-02, -9.63663831e-02, -7.09380433e-02, -1.13573354e-02,\n",
       "        6.00832421e-03,  8.78628269e-02, -5.98827712e-02, -9.23301056e-02,\n",
       "       -4.50320076e-03, -5.63986599e-02,  4.53348458e-02,  3.78738306e-02,\n",
       "        5.99536076e-02, -1.28052041e-01,  4.04285975e-02, -3.78756784e-02,\n",
       "        3.12889293e-02,  9.38403755e-02, -4.25751545e-02,  6.59759119e-02,\n",
       "        4.35056053e-02,  5.18296696e-02,  1.82603579e-02, -6.49777800e-02,\n",
       "        7.00675771e-02, -4.92704809e-02, -1.68415904e-01,  5.23139499e-02,\n",
       "        3.83700244e-02,  1.07967414e-01, -7.43735814e-03, -7.37644657e-02,\n",
       "       -1.24872595e-01,  5.78424297e-02, -2.38649244e-03, -1.53102409e-02,\n",
       "       -6.11076914e-02,  2.24426929e-02, -6.92097179e-04, -2.30257772e-02,\n",
       "       -1.48180230e-02, -3.36741880e-02,  7.74893984e-02,  8.07727277e-02,\n",
       "       -2.20492785e-03,  1.27517328e-01,  9.44450274e-02,  3.36121321e-02,\n",
       "       -1.96443759e-02,  4.90238965e-02, -2.46548336e-02,  1.26726747e-01,\n",
       "       -3.39177959e-02,  7.20960572e-02, -9.15392563e-02,  8.50006379e-03,\n",
       "       -1.22108171e-02,  1.48893744e-02,  1.32497773e-01,  1.41326830e-01,\n",
       "        6.87110648e-02, -1.56670183e-01, -5.35816560e-03,  2.83910688e-02,\n",
       "       -1.06016686e-02,  9.26861390e-02,  1.66620925e-01,  1.07195437e-01,\n",
       "       -3.71209206e-03,  6.45311922e-02, -3.53275687e-02,  1.83124207e-02,\n",
       "       -6.20843086e-04, -1.73154816e-01,  4.12612334e-02, -3.11823236e-03,\n",
       "        5.11366613e-02,  1.47383744e-02,  4.34940830e-02,  4.77703400e-02],\n",
       "      dtype=float32)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# w2v_model.wv['电脑']\n",
    "w2v_model.wv.word_vec('办公室', use_norm=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
